library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ──────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.6 ✔ purrr 0.3.4
✔ tibble 3.1.6 ✔ dplyr 1.0.9
✔ tidyr 1.2.0 ✔ stringr 1.4.0
✔ readr 2.1.2 ✔ forcats 0.5.1
Warning: package ‘ggplot2’ was built under R version 4.2.1Warning: package ‘dplyr’ was built under R version 4.2.1── Conflicts ─────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(janitor)
Warning: package ‘janitor’ was built under R version 4.2.1
Attaching package: ‘janitor’
The following objects are masked from ‘package:stats’:
chisq.test, fisher.test
glimpse(national_incidence)
Rows: 3,400
Columns: 61
$ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 2…
$ country <chr> "S92000003", "S92000003", "S92000003", "S92000003", "S92000003", "S92000…
$ cancer_site_icd10code <chr> "C00-C97, excluding C44", "C00-C97, excluding C44", "C00-C97, excluding …
$ cancer_site <chr> "All cancer types", "All cancer types", "All cancer types", "All cancer …
$ sex <chr> "All", "All", "All", "All", "All", "All", "All", "All", "All", "All", "A…
$ sex_qf <chr> "d", "d", "d", "d", "d", "d", "d", "d", "d", "d", "d", "d", "d", "d", "d…
$ year <dbl> 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, …
$ incidences_age_under5 <dbl> 58, 57, 59, 56, 44, 56, 62, 58, 51, 54, 51, 54, 61, 51, 44, 52, 54, 57, …
$ incidences_age5to9 <dbl> 19, 38, 25, 28, 35, 31, 39, 43, 41, 22, 21, 31, 38, 23, 18, 31, 32, 23, …
$ incidences_age10to14 <dbl> 28, 38, 35, 31, 27, 36, 51, 38, 46, 36, 26, 21, 31, 32, 32, 33, 32, 30, …
$ incidences_age15to19 <dbl> 55, 57, 57, 51, 47, 51, 61, 78, 55, 60, 60, 67, 59, 67, 56, 61, 58, 65, …
$ incidences_age20to24 <dbl> 106, 113, 89, 90, 99, 113, 106, 96, 99, 119, 113, 101, 115, 118, 133, 11…
$ incidences_age25to29 <dbl> 213, 194, 176, 179, 187, 156, 169, 144, 157, 156, 155, 175, 203, 205, 20…
$ incidences_age30to34 <dbl> 302, 316, 305, 302, 300, 280, 296, 298, 257, 248, 266, 287, 259, 245, 29…
$ incidences_age35to39 <dbl> 459, 426, 456, 421, 441, 420, 434, 454, 448, 512, 447, 446, 487, 420, 42…
$ incidences_age40to44 <dbl> 614, 608, 647, 631, 659, 661, 729, 675, 710, 755, 705, 747, 834, 802, 81…
$ incidences_age45to49 <dbl> 1120, 1068, 912, 924, 983, 922, 959, 1042, 1062, 1026, 1107, 1156, 1184,…
$ incidences_age50to54 <dbl> 1633, 1577, 1621, 1665, 1711, 1675, 1647, 1539, 1680, 1533, 1622, 1713, …
$ incidences_age55to59 <dbl> 2169, 2104, 2096, 2080, 2118, 2217, 2244, 2292, 2425, 2542, 2449, 2408, …
$ incidences_age60to64 <dbl> 2906, 2791, 2757, 2839, 2922, 2890, 2985, 2958, 3147, 2973, 3086, 3464, …
$ incidences_age65to69 <dbl> 3972, 3645, 3635, 3509, 3457, 3559, 3635, 3663, 3906, 3782, 3857, 4043, …
$ incidences_age70to74 <dbl> 4490, 4143, 4248, 4124, 4124, 4110, 4135, 4234, 4105, 4219, 4262, 4122, …
$ incidences_age75to79 <dbl> 3853, 4000, 3919, 4022, 4019, 3972, 4033, 4086, 4017, 4030, 4065, 4245, …
$ incidences_age80to84 <dbl> 2972, 2767, 2579, 2514, 2717, 2694, 2976, 3127, 3278, 3114, 3233, 3168, …
$ incidences_age85to89 <dbl> 1607, 1571, 1489, 1711, 1574, 1608, 1471, 1515, 1552, 1580, 1737, 1862, …
$ incidences_age90and_over <dbl> 678, 685, 657, 670, 707, 746, 739, 798, 811, 772, 816, 820, 764, 807, 87…
$ incidences_all_ages <dbl> 27254, 26198, 25762, 25847, 26171, 26197, 26771, 27138, 27847, 27533, 28…
$ incidence_rate_age_under5 <dbl> 18.70467, 18.83021, 19.90614, 19.28966, 15.53601, 20.27069, 22.98135, 21…
$ incidence_rate_age5to9 <dbl> 5.872535, 11.736545, 7.795399, 8.792589, 11.169904, 10.136914, 12.985240…
$ incidence_rate_age10to14 <dbl> 8.772837, 11.868732, 10.823046, 9.549657, 8.362136, 11.148168, 15.789767…
$ incidence_rate_age15to19 <dbl> 17.71639, 18.08181, 17.90673, 16.05414, 14.78471, 16.05768, 19.14663, 24…
$ incidence_rate_age20to24 <dbl> 31.13554, 35.08750, 28.76136, 29.27953, 31.97623, 35.82809, 32.79947, 29…
$ incidence_rate_age25to29 <dbl> 54.41669, 51.18869, 48.44068, 51.72992, 56.67148, 49.54190, 56.07390, 49…
$ incidence_rate_age30to34 <dbl> 74.26954, 77.87471, 75.81765, 76.16473, 77.57290, 73.44513, 79.86294, 83…
$ incidence_rate_age35to39 <dbl> 120.11556, 109.63501, 115.64300, 105.49105, 109.41353, 104.15840, 107.77…
$ incidence_rate_age40to44 <dbl> 180.68655, 175.00835, 182.20271, 173.77463, 177.48260, 174.45789, 188.76…
$ incidence_rate_age45to49 <dbl> 315.3393, 313.9819, 272.8921, 277.8396, 294.8799, 272.6133, 277.0107, 29…
$ incidence_rate_age50to54 <dbl> 550.5602, 495.5816, 491.0812, 492.6079, 494.7046, 477.3671, 488.7037, 46…
$ incidence_rate_age55to59 <dbl> 794.5084, 770.0979, 758.2115, 740.8172, 748.4284, 764.1191, 719.3507, 70…
$ incidence_rate_age60to64 <dbl> 1123.252, 1081.515, 1058.345, 1082.060, 1110.031, 1104.947, 1138.539, 11…
$ incidence_rate_age65to69 <dbl> 1660.507, 1519.326, 1516.068, 1469.991, 1448.638, 1486.236, 1517.106, 15…
$ incidence_rate_age70to74 <dbl> 2153.188, 2010.716, 2064.953, 2004.754, 1996.379, 1983.801, 1976.474, 20…
$ incidence_rate_age75to79 <dbl> 2547.724, 2525.157, 2382.327, 2363.602, 2418.506, 2398.319, 2453.372, 24…
$ incidence_rate_age80to84 <dbl> 2831.150, 2712.931, 2636.637, 2664.490, 2705.232, 2538.420, 2658.425, 26…
$ incidence_rate_age85to89 <dbl> 2833.216, 2724.356, 2537.016, 2897.986, 2651.394, 2706.661, 2534.022, 27…
$ incidence_rate_age90and_over <dbl> 2783.708, 2739.233, 2518.496, 2472.781, 2498.586, 2538.710, 2490.731, 26…
$ crude_rate <dbl> 535.2118, 515.3698, 507.4186, 509.6068, 516.9131, 517.2979, 528.4445, 53…
$ crude_rate_lower95pc_confidence_interval <dbl> 528.8762, 509.1478, 501.2411, 503.4128, 510.6692, 511.0525, 522.1331, 52…
$ crude_rate_upper95pc_confidence_interval <dbl> 541.6043, 521.6489, 513.6533, 515.8579, 523.2143, 523.6006, 534.8132, 54…
$ easr <dbl> 690.3965, 656.7481, 638.6931, 635.8372, 637.1422, 631.0517, 638.4489, 64…
$ easr_lower95pc_confidence_interval <dbl> 681.6498, 648.2928, 630.4566, 627.6781, 629.0608, 623.0758, 630.4726, 63…
$ easr_lower95pc_confidence_interval_qf <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ easr_upper95pc_confidence_interval <dbl> 699.1981, 665.2575, 646.9822, 644.0483, 645.2745, 639.0776, 646.4747, 65…
$ easr_upper95pc_confidence_interval_qf <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ wasr <dbl> 313.0067, 298.2655, 291.3470, 289.1945, 291.1166, 289.0455, 293.9213, 29…
$ wasr_lower95pc_confidence_interval <dbl> 309.0002, 294.3557, 287.5015, 285.3755, 287.2941, 285.2388, 290.0788, 28…
$ wasr_lower95pc_confidence_interval_qf <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ wasr_upper95pc_confidence_interval <dbl> 317.0387, 302.2006, 295.2177, 293.0385, 294.9641, 292.8770, 297.7887, 29…
$ wasr_upper95pc_confidence_interval_qf <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ standardised_incidence_ratio <dbl> 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 10…
$ standardised_incidence_ratio_qf <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
hb_incidence <- read_csv("raw_data/incidence_by_health_board.csv") %>%
clean_names()
Rows: 47600 Columns: 24── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (9): HB, CancerSiteICD10Code, CancerSite, Sex, SexQF, EASRLower95pcConfidenceIntervalQF, EASRUpper95pcConfidenceIn...
dbl (15): _id, Year, IncidencesAllAges, CrudeRate, CrudeRateLower95pcConfidenceInterval, CrudeRateUpper95pcConfidenceIn...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hb_names <- read_csv("raw_data/geography_codes_and_labels_hb2014_01042019.csv") %>%
clean_names() %>%
select(hb, hb_name)
Rows: 18 Columns: 5── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): HB, HBName, Country
dbl (2): HBDateEnacted, HBDateArchived
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hb_incidence
hb_names
hb_incidence <- hb_incidence %>%
left_join(hb_names, "hb") %>%
filter(hb_name == "NHS Borders") %>%
select(-ends_with("_qf"))
The year on year incidence rates of cancer has trended upwards since 1996 reaching a peak in 2017, since 2017, however, there has been a significant downwards trend with the incidences in 2020 being the lowest since 2009, albeit this is presumably due to the covid pandemic where people were advised not to go to hospitals except in emergencies and many cancers may have gone un-diagnosed during this time period.
hb_incidence %>%
filter(cancer_site == "All cancer types" & sex == "All") %>%
select(id:year, crude_rate, easr, wasr) %>%
pivot_longer(c(crude_rate, easr, wasr), names_to = "type", values_to = "incidence_rate") %>%
ggplot(aes(x = year, y = incidence_rate, group = type, col = type)) +
geom_line()
NA
NA
hb_5yr <- read_csv("raw_data/5yr_summary_incidence_by_health_board.csv") %>%
clean_names() %>%
left_join(hb_names, "hb") %>%
select(id, hb_name, everything(), -hb, -ends_with("_qf")) %>%
filter(hb_name == "NHS Borders")
Rows: 1632 Columns: 60── Column specification ──────────────────────────────────────────────────────────────────
Delimiter: ","
chr (10): HB, CancerSiteICD10Code, CancerSite, Sex, SexQF, Year, EASRLower95pcConfiden...
dbl (50): _id, IncidencesAgeUnder5, IncidencesAge5To9, IncidencesAge10To14, Incidences...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
top_5_cancers_sex <- hb_5yr %>%
filter(cancer_site != "All cancer types", sex == "All") %>%
slice_max(incidences_all_ages, n = 5) %>%
mutate(cancer_site = factor(cancer_site))
top_5_cancers_list <- top_5_cancers_sex %>%
pull(cancer_site)
top_5_cancers_sex %>%
ggplot(aes(x = reorder(cancer_site, sort(incidences_all_ages)),
y = incidences_all_ages, label = scales::comma(incidences_all_ages))) +
geom_col() +
geom_text(position = position_nudge(y = 75)) +
theme_classic() +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0, 0.1)),
labels = scales::comma) +
labs(x = "Cancer Type",
y = "Incidences",
title = "Top 5 most common cancers in the Scottish Borders",
subtitle = "2016 - 2020") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))
age groups of total cancer incidences
top_5_list <- top_5_cancers_sex %>%
pull(cancer_site)
hb_5yr_ages <- hb_5yr %>%
filter(sex != "All") %>%
mutate(sex = factor(sex, c("Male", "Female"))) %>%
select(id:incidences_age85and_over) %>%
pivot_longer(incidences_age_under5:incidences_age85and_over,
names_to = "age_group", values_to = "incidences") %>%
mutate(age_group = str_remove(age_group, "incidences_age|"),
age_group = str_remove(age_group, "_"),
age_group = str_replace(age_group, "to", " - "),
age_group = str_replace(age_group, "under5", "Under 5"),
age_group = str_replace(age_group, "85andover", "85+"))
age_group_list <- hb_5yr_ages %>%
head(18) %>%
pull(age_group)
hb_5yr_ages %>%
mutate(age_group = factor(age_group, levels = age_group_list)) %>%
ggplot(aes(x = age_group, y = incidences)) +
geom_col() +
scale_y_continuous(expand = c(0, 0), labels = scales::comma) +
theme_classic()+
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "Age Group",
y = "Total Incidences",
title = "Total number of cancer incidences in the Scottish Borders",
subtitle = "1996 to 2020")
hb_5yr_incidence_rates <- hb_5yr %>%
filter(sex != "All") %>%
mutate(sex = factor(sex, c("Male", "Female"))) %>%
pivot_longer(incidence_rate_age_under5:incidence_rate_age85and_over,
names_to = "age_group", values_to = "incidences") %>%
select(id:sex, age_group, incidences) %>%
mutate(age_group = str_remove(age_group, "incidence_rate_age|"),
age_group = str_remove(age_group, "_"),
age_group = str_replace(age_group, "to", " - "),
age_group = str_replace(age_group, "under5", "Under 5"),
age_group = str_replace(age_group, "85andover", "85+"))
hb_5yr_incidence_rates %>%
mutate(age_group = factor(age_group, levels = age_group_list)) %>%
ggplot(aes(x = age_group, y = incidences)) +
geom_col() +
scale_y_continuous(expand = c(0, 0), labels = scales::comma) +
theme_classic()+
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "Age Group",
y = "Incidence Rate (per x,000)",
title = "Cancer incidence rates per age group in the Scottish Borders",
subtitle = "1996 to 2020")
sex of top 5 cancers
i.e. cancers which are higher than the national average and affect many people are priority
p <- hb_5yr %>%
select(cancer_site, sex, incidences_all_ages,
standardised_incidence_ratio:sir_upper95pc_confidence_interval) %>%
filter(sex == "All", incidences_all_ages > 0) %>%
mutate(standardised_incidence_ratio = round(standardised_incidence_ratio, 1)) %>%
rename(SIR = standardised_incidence_ratio,
Incidences = incidences_all_ages,
Type = cancer_site) %>%
ggplot(aes(label = Type, y = Incidences, x = SIR)) +
geom_point() +
xlim(0,200) +
ylim(0, 10000) +
scale_y_continuous(trans = "log10", name = "log(incidences)") +
theme_classic() +
geom_vline(aes(xintercept = 100)) +
geom_hline(aes(yintercept = 100)) +
theme(panel.background = element_rect(colour = "black")) +
labs(x = "SIR",
y = "Incidences",
title = "Cancer type prioritisation matrix",
subtitle = "NHS Borders data: 2016 - 2020")
Scale for 'y' is already present. Adding another scale for 'y', which will replace the
existing scale.
ggplotly(p,
tooltip = c("label", "y", "x")) %>%
config(displayModeBar = F)
NA